Overview
This analysis explores a wheat dataset containing yield information
across different countries, locations, and time periods. The primary
objectives are:
- Perform comprehensive data cleaning and quality assessment
- Critically evaluate predictor variable availability for ML
applications
- Create interactive visualizations to understand geographic and
temporal patterns
- Assess data completeness across all 108 variables and identify
missing value patterns
- Provide realistic ML readiness assessment considering both
target and predictor variables
- Recommend feasible analytical approaches given data
limitations
Load Required Libraries
library(data.table) # For fast file reading
library(tidyverse) # Data manipulation
library(knitr) # Tables
library(ggplot2) # Static plots only
library(scales) # For formatting
library(leaflet)
library(gt)
library(kableExtra)
library(plotly) # Interactive plots
library(furrr) # Parallel mapping functions
library(VIM) # Missing data visualization
Data Loading and Initial Exploration
# Load wheat data with irrigation and rainfall calendar information
# Note: fileEncoding handles special characters in country/location names
data <- read.csv("../../../data/Data/23-05-mergedwheatdata.csv", fileEncoding = "latin1")
# Initial data inspection
cat("Dataset dimensions:", dim(data), "\n")
## Dataset dimensions: 7635698 108
cat("Number of countries:", length(unique(data$Country)), "\n")
## Number of countries: 40
cat("Number of variables:", ncol(data), "\n")
## Number of variables: 108
cat("Time period covered:", range(data$Observation.period, na.rm = TRUE), "\n")
## Time period covered: 1980 2022
# Display all column names for reference
cat("Total variables in dataset:", length(names(data)), "\n")
## Total variables in dataset: 108
cat("First 20 variables:\n")
## First 20 variables:
head(names(data), 20)
## [1] "id" "Data.ID"
## [3] "Location" "State.Region.County.Province"
## [5] "Country" "Continent"
## [7] "Latitude..N.S." "Longitude..E.W."
## [9] "Conversion.for.latitude" "Conversion.for.longitude"
## [11] "Location.source" "Observation.period"
## [13] "Wheat.Type" "Crop.variety"
## [15] "Tillage.type" "Planting.date"
## [17] "Harvesting.date" "Flowering.stage"
## [19] "Treatment" "Treatment.type"
Critical Missing Data Analysis
Comprehensive Variable Availability Assessment
# Get comprehensive missing data statistics for all 108 variables
variable_completeness <- data %>%
summarise(across(everything(), ~sum(!is.na(.)))) %>%
pivot_longer(everything(), names_to = "variable", values_to = "n_available") %>%
mutate(
total_observations = nrow(data),
percent_available = round((n_available / total_observations) * 100, 1),
ml_usability = case_when(
percent_available >= 80 ~ "Highly usable (≥80%)",
percent_available >= 50 ~ "Moderately usable (50-79%)",
percent_available >= 20 ~ "Limited use (20-49%)",
percent_available >= 5 ~ "Rarely available (5-19%)",
TRUE ~ "Essentially missing (<5%)"
)
) %>%
arrange(desc(percent_available))
# Display summary of variable availability
variable_completeness %>%
count(ml_usability) %>%
mutate(percentage = round(n / sum(n) * 100, 1)) %>%
gt() %>%
tab_header(title = "Variable Availability Summary Across All 108 Variables") %>%
cols_label(
ml_usability = "Usability Category",
n = "Number of Variables",
percentage = "% of All Variables"
) %>%
data_color(
columns = ml_usability,
colors = scales::col_factor(
palette = c("darkred", "red", "orange", "yellow", "green"),
domain = c("Essentially missing (<5%)", "Rarely available (5-19%)",
"Limited use (20-49%)", "Moderately usable (50-79%)", "Highly usable (≥80%)")
)
)
| Variable Availability Summary Across All 108 Variables |
| Usability Category |
Number of Variables |
% of All Variables |
| Essentially missing (<5%) |
63 |
58.3 |
| Highly usable (≥80%) |
45 |
41.7 |
Top Available Variables for ML
# Identify which variables are actually usable for ML
usable_variables <- variable_completeness %>%
filter(percent_available >= 50) %>% # At least 50% availability
head(108)
usable_variables %>%
gt() %>%
tab_header(title = "Top 20 Most Complete Variables (≥50% Available)") %>%
cols_label(
variable = "Variable Name",
n_available = "Available Observations",
total_observations = "Total Possible",
percent_available = "% Available",
ml_usability = "ML Usability"
) %>%
data_color(
columns = percent_available,
colors = scales::col_numeric(
palette = c("red", "yellow", "green"),
domain = c(0, 100)
)
)
| Top 20 Most Complete Variables (≥50% Available) |
| Variable Name |
Available Observations |
Total Possible |
% Available |
ML Usability |
| Data.ID |
7635698 |
7635698 |
100.0 |
Highly usable (≥80%) |
| Location |
7635698 |
7635698 |
100.0 |
Highly usable (≥80%) |
| State.Region.County.Province |
7635357 |
7635698 |
100.0 |
Highly usable (≥80%) |
| Country |
7635698 |
7635698 |
100.0 |
Highly usable (≥80%) |
| Continent |
7635698 |
7635698 |
100.0 |
Highly usable (≥80%) |
| Latitude..N.S. |
7635698 |
7635698 |
100.0 |
Highly usable (≥80%) |
| Longitude..E.W. |
7635698 |
7635698 |
100.0 |
Highly usable (≥80%) |
| Location.source |
7635698 |
7635698 |
100.0 |
Highly usable (≥80%) |
| Treatment.type |
7580944 |
7635698 |
99.3 |
Highly usable (≥80%) |
| Treatment |
7574652 |
7635698 |
99.2 |
Highly usable (≥80%) |
| N.type |
7572856 |
7635698 |
99.2 |
Highly usable (≥80%) |
| Water.regime |
7566269 |
7635698 |
99.1 |
Highly usable (≥80%) |
| Category.5 |
7565903 |
7635698 |
99.1 |
Highly usable (≥80%) |
| Plastic.film.mulching |
7565879 |
7635698 |
99.1 |
Highly usable (≥80%) |
| Emissions..yes.no. |
7568521 |
7635698 |
99.1 |
Highly usable (≥80%) |
| SE...22 |
7558039 |
7635698 |
99.0 |
Highly usable (≥80%) |
| Climate |
7560686 |
7635698 |
99.0 |
Highly usable (≥80%) |
| Soil.texture |
7559055 |
7635698 |
99.0 |
Highly usable (≥80%) |
| Category |
7558788 |
7635698 |
99.0 |
Highly usable (≥80%) |
| Category.1 |
7558560 |
7635698 |
99.0 |
Highly usable (≥80%) |
| Category.2 |
7558575 |
7635698 |
99.0 |
Highly usable (≥80%) |
| Category.3 |
7558688 |
7635698 |
99.0 |
Highly usable (≥80%) |
| Category.4 |
7558788 |
7635698 |
99.0 |
Highly usable (≥80%) |
| EFd.... |
7558846 |
7635698 |
99.0 |
Highly usable (≥80%) |
| Wheat.Type |
7520076 |
7635698 |
98.5 |
Highly usable (≥80%) |
| Crop.variety |
7518700 |
7635698 |
98.5 |
Highly usable (≥80%) |
| Pest.prescence....64 |
7512016 |
7635698 |
98.4 |
Highly usable (≥80%) |
| Harvesting.date |
7503341 |
7635698 |
98.3 |
Highly usable (≥80%) |
| Irrigation..mm. |
7508939 |
7635698 |
98.3 |
Highly usable (≥80%) |
| Soil.type |
7500346 |
7635698 |
98.2 |
Highly usable (≥80%) |
| P.rate..kg.P.ha.1. |
7495519 |
7635698 |
98.2 |
Highly usable (≥80%) |
| Tillage.type |
7485484 |
7635698 |
98.0 |
Highly usable (≥80%) |
| Planting.date |
7472831 |
7635698 |
97.9 |
Highly usable (≥80%) |
| Soil.depth..cm. |
7472564 |
7635698 |
97.9 |
Highly usable (≥80%) |
| Soil.organic.carbon.... |
7471803 |
7635698 |
97.9 |
Highly usable (≥80%) |
| P.type |
7478253 |
7635698 |
97.9 |
Highly usable (≥80%) |
| Pest.detected...65 |
7472892 |
7635698 |
97.9 |
Highly usable (≥80%) |
| Pest.severity.score.......66 |
7472875 |
7635698 |
97.9 |
Highly usable (≥80%) |
| N.fertilizer.management |
7471021 |
7635698 |
97.8 |
Highly usable (≥80%) |
| Straw.return |
7470697 |
7635698 |
97.8 |
Highly usable (≥80%) |
| Pest.prescence....68 |
7463998 |
7635698 |
97.8 |
Highly usable (≥80%) |
| SE...56 |
7463466 |
7635698 |
97.7 |
Highly usable (≥80%) |
| Pest.detected...69 |
7463754 |
7635698 |
97.7 |
Highly usable (≥80%) |
| Pest.prescence....72 |
7463858 |
7635698 |
97.7 |
Highly usable (≥80%) |
| Main.weed.detected |
7463862 |
7635698 |
97.7 |
Highly usable (≥80%) |
Essential Variables
# Define essential variable categories based on actual dataset variables
essential_categories <- list(
climate = c("temperature", "precipitation", "temp1", "temp2", "temp3", "temp4", "temp5",
"temp6", "temp7", "temp8", "temp9", "prc1", "prc2", "prc3", "prc4", "prc5",
"prc6", "prc7", "prc8", "prc9", "Climate", "Water.regime", "Mean.annual"),
soil = c("Soil.type", "Soil.depth", "Sand", "Silt", "Clay", "Soil.texture",
"Soil.organic.carbon", "TN", "C.N.ratio", "Soil.pH", "BD", "Soil_N"),
management = c("Tillage", "Planting", "Harvesting", "Treatment", "N.type", "N.rate",
"N.fertilizer", "P.type", "P.rate", "Irrigation", "Straw.return",
"Plastic.film.mulching", "Crop.variety", "Wheat.Type"),
geographic = c("Country", "Location", "State.Region", "Continent", "Latitude",
"Longitude", "Conversion.for", "Elevation", "AEZ", "X", "Y"),
temporal = c("Observation.period", "date", "start_date", "end_date", "rainfed_start",
"rainfed_end", "irr_start", "irr_end"),
target = c("Grain.yield", "yield"),
pest_weed = c("Pest", "weed", "severity", "abundance", "coverage"),
emissions = c("N2O", "Emissions", "PFPN", "ANE", "EFd")
)
# Check availability of essential variables by category
essential_assessment <- map_dfr(names(essential_categories), function(category) {
pattern <- paste(essential_categories[[category]], collapse = "|")
matching_vars <- variable_completeness %>%
filter(str_detect(variable, regex(pattern, ignore_case = TRUE)))
if(nrow(matching_vars) > 0) {
matching_vars %>%
mutate(category = category) %>%
slice_max(percent_available, n = 3) # Top 3 per category
} else {
tibble(variable = paste("No", category, "variables found"),
n_available = 0, total_observations = nrow(data),
percent_available = 0, ml_usability = "Missing",
category = category)
}
})
essential_assessment %>%
gt() %>%
tab_header(title = "Essential Agricultural Variables by Category - Based on Actual Dataset") %>%
cols_label(
category = "Variable Category",
variable = "Variable Name",
percent_available = "% Available",
ml_usability = "ML Usability"
) %>%
data_color(
columns = percent_available,
colors = scales::col_numeric(
palette = c("red", "yellow", "green"),
domain = c(0, 100)
)
)
| Essential Agricultural Variables by Category - Based on Actual Dataset |
| Variable Name |
n_available |
total_observations |
% Available |
ML Usability |
Variable Category |
| Water.regime |
7566269 |
7635698 |
99.1 |
Highly usable (≥80%) |
climate |
| Climate |
7560686 |
7635698 |
99.0 |
Highly usable (≥80%) |
climate |
| Mean.annual.precipitation..mm. |
142176 |
7635698 |
1.9 |
Essentially missing (<5%) |
climate |
| Soil.texture |
7559055 |
7635698 |
99.0 |
Highly usable (≥80%) |
soil |
| Soil.type |
7500346 |
7635698 |
98.2 |
Highly usable (≥80%) |
soil |
| Soil.depth..cm. |
7472564 |
7635698 |
97.9 |
Highly usable (≥80%) |
soil |
| Soil.organic.carbon.... |
7471803 |
7635698 |
97.9 |
Highly usable (≥80%) |
soil |
| Treatment.type |
7580944 |
7635698 |
99.3 |
Highly usable (≥80%) |
management |
| Treatment |
7574652 |
7635698 |
99.2 |
Highly usable (≥80%) |
management |
| N.type |
7572856 |
7635698 |
99.2 |
Highly usable (≥80%) |
management |
| Location |
7635698 |
7635698 |
100.0 |
Highly usable (≥80%) |
geographic |
| State.Region.County.Province |
7635357 |
7635698 |
100.0 |
Highly usable (≥80%) |
geographic |
| Country |
7635698 |
7635698 |
100.0 |
Highly usable (≥80%) |
geographic |
| Continent |
7635698 |
7635698 |
100.0 |
Highly usable (≥80%) |
geographic |
| Latitude..N.S. |
7635698 |
7635698 |
100.0 |
Highly usable (≥80%) |
geographic |
| Longitude..E.W. |
7635698 |
7635698 |
100.0 |
Highly usable (≥80%) |
geographic |
| Location.source |
7635698 |
7635698 |
100.0 |
Highly usable (≥80%) |
geographic |
| Harvesting.date |
7503341 |
7635698 |
98.3 |
Highly usable (≥80%) |
temporal |
| Planting.date |
7472831 |
7635698 |
97.9 |
Highly usable (≥80%) |
temporal |
| Observation.period |
172970 |
7635698 |
2.3 |
Essentially missing (<5%) |
temporal |
| start_date |
172970 |
7635698 |
2.3 |
Essentially missing (<5%) |
temporal |
| end_date |
172970 |
7635698 |
2.3 |
Essentially missing (<5%) |
temporal |
| Grain.yield..tons.ha.1. |
172961 |
7635698 |
2.3 |
Essentially missing (<5%) |
target |
| Yield.scaled.N2O.emission..g.N.Mg.1. |
96 |
7635698 |
0.0 |
Essentially missing (<5%) |
target |
| Pest.prescence....64 |
7512016 |
7635698 |
98.4 |
Highly usable (≥80%) |
pest_weed |
| Pest.detected...65 |
7472892 |
7635698 |
97.9 |
Highly usable (≥80%) |
pest_weed |
| Pest.severity.score.......66 |
7472875 |
7635698 |
97.9 |
Highly usable (≥80%) |
pest_weed |
| Emissions..yes.no. |
7568521 |
7635698 |
99.1 |
Highly usable (≥80%) |
emissions |
| EFd.... |
7558846 |
7635698 |
99.0 |
Highly usable (≥80%) |
emissions |
| Cumulative.N2O.fluxes..kg.N.ha.1. |
94613 |
7635698 |
1.2 |
Essentially missing (<5%) |
emissions |
Data Cleaning and Preprocessing
Column Selection and Type Conversion
# Select key variables and rename for easier handling
# NOTE: This is now limited to only the most complete variables
yield <- data %>%
select(Country,
Observation.period,
Location,
Continent,
State.Region.County.Province,
longitude = Conversion.for.longitude, # Rename for clarity
latitude = Conversion.for.latitude, # Rename for clarity
yield_value = Grain.yield..tons.ha.1.) %>% # Main outcome variable
mutate(
# Convert character columns to appropriate numeric types
longitude = as.numeric(longitude),
latitude = as.numeric(latitude),
yield_value = as.numeric(yield_value),
Observation.period = as.numeric(Observation.period)
) %>%
filter(
# Remove problematic observations
!is.na(yield_value), # Remove missing yield values
!is.na(longitude), # Remove missing coordinates
!is.na(latitude),
yield_value <= 30, # Remove unrealistic yield outliers (>30 tons/ha)
nchar(as.character(Observation.period)) == 4 # Keep only 4-digit years
)
cat("Cleaned dataset dimensions:", dim(yield), "\n")
## Cleaned dataset dimensions: 172088 8
cat("Yield range:", range(yield$yield_value), "tons/ha\n")
## Yield range: 0.01673634 28.5 tons/ha
cat("Variables retained for analysis:", ncol(yield), "out of original", ncol(data), "\n")
## Variables retained for analysis: 8 out of original 108
Data Quality Summary
# Summary statistics for the cleaned dataset
summary(yield)
## Country Observation.period Location Continent
## Length:172088 Min. :1980 Length:172088 Length:172088
## Class :character 1st Qu.:2012 Class :character Class :character
## Mode :character Median :2012 Mode :character Mode :character
## Mean :2013
## 3rd Qu.:2017
## Max. :2022
## State.Region.County.Province longitude latitude
## Length:172088 Min. :-121.93 Min. :-37.82
## Class :character 1st Qu.:-109.09 1st Qu.: 34.53
## Mode :character Median : 115.50 Median : 34.53
## Mean : 30.61 Mean : 33.25
## 3rd Qu.: 115.50 3rd Qu.: 35.45
## Max. : 152.39 Max. : 60.70
## yield_value
## Min. : 0.01674
## 1st Qu.: 5.79828
## Median : 7.81000
## Mean : 6.77131
## 3rd Qu.: 7.83000
## Max. :28.50000
Realistic ML Readiness Assessment
CRITICAL LIMITATION: Predictor Variable
Scarcity
# Calculate realistic ML feasibility by country
realistic_ml_assessment <- data %>%
group_by(Country) %>%
summarise(
# Target variable quality (yield data)
n_yield_obs = sum(!is.na(Grain.yield..tons.ha.1.)),
yield_completeness = round(mean(!is.na(Grain.yield..tons.ha.1.)) * 100, 1),
# Predictor variable availability
total_variables = ncol(.) - 1, # Exclude country from count
variables_50pct_complete = sum(sapply(select(., -Country), function(x) mean(!is.na(x)) >= 0.5)),
variables_80pct_complete = sum(sapply(select(., -Country), function(x) mean(!is.na(x)) >= 0.8)),
# Key geographic/temporal predictors
has_coordinates = !is.na(first(Conversion.for.longitude)) & !is.na(first(Conversion.for.latitude)),
has_temporal_data = !is.na(first(Observation.period)),
.groups = 'drop'
) %>%
mutate(
# Predictor availability scores
predictor_50_score = round((variables_50pct_complete / total_variables) * 100, 1),
predictor_80_score = round((variables_80pct_complete / total_variables) * 100, 1),
# Realistic ML feasibility classification
realistic_ml_feasibility = case_when(
n_yield_obs >= 15 & predictor_80_score >= 20 ~ "Feasible with external data",
n_yield_obs >= 10 & predictor_50_score >= 15 ~ "Limited ML possible",
n_yield_obs >= 10 & has_coordinates & has_temporal_data ~ "Geographic/temporal only",
n_yield_obs >= 5 ~ "Descriptive analysis only",
TRUE ~ "Insufficient for any ML"
),
# Recommended approach
recommended_strategy = case_when(
realistic_ml_feasibility == "Feasible with external data" ~ "Integrate climate/soil databases + simple ML",
realistic_ml_feasibility == "Limited ML possible" ~ "Use available predictors + correlation analysis",
realistic_ml_feasibility == "Geographic/temporal only" ~ "Spatial interpolation + time series",
realistic_ml_feasibility == "Descriptive analysis only" ~ "Trend analysis + basic statistics",
TRUE ~ "Focus on data collection"
)
) %>%
arrange(desc(n_yield_obs))
# Display realistic assessment
realistic_ml_assessment %>%
head(15) %>%
select(Country, n_yield_obs, predictor_50_score, predictor_80_score,
realistic_ml_feasibility, recommended_strategy) %>%
gt() %>%
tab_header(title = "Realistic ML Feasibility Assessment - Considering Predictor Availability") %>%
cols_label(
Country = "Country",
n_yield_obs = "Yield Observations",
predictor_50_score = "Variables ≥50% Complete (%)",
predictor_80_score = "Variables ≥80% Complete (%)",
realistic_ml_feasibility = "Realistic ML Feasibility",
recommended_strategy = "Recommended Strategy"
) %>%
data_color(
columns = realistic_ml_feasibility,
colors = scales::col_factor(
palette = c("darkred", "red", "orange", "yellow", "lightgreen"),
domain = c("Insufficient for any ML", "Descriptive analysis only",
"Geographic/temporal only", "Limited ML possible", "Feasible with external data")
)
) %>%
cols_width(
recommended_strategy ~ px(200)
) %>%
tab_style(
style = cell_text(size = px(10)),
locations = cells_body(columns = recommended_strategy)
)
| Realistic ML Feasibility Assessment - Considering Predictor Availability |
| Country |
Yield Observations |
Variables ≥50% Complete (%) |
Variables ≥80% Complete (%) |
Realistic ML Feasibility |
Recommended Strategy |
| China |
95319 |
42.1 |
42.1 |
Feasible with external data |
Integrate climate/soil databases + simple ML |
| United States of America |
39900 |
42.1 |
42.1 |
Feasible with external data |
Integrate climate/soil databases + simple ML |
| Mexico |
14262 |
42.1 |
42.1 |
Feasible with external data |
Integrate climate/soil databases + simple ML |
| Kenya |
7053 |
42.1 |
42.1 |
Feasible with external data |
Integrate climate/soil databases + simple ML |
| United Kingdom |
6397 |
42.1 |
42.1 |
Feasible with external data |
Integrate climate/soil databases + simple ML |
| Ethiopia |
3909 |
42.1 |
42.1 |
Feasible with external data |
Integrate climate/soil databases + simple ML |
| Germany |
1191 |
42.1 |
42.1 |
Feasible with external data |
Integrate climate/soil databases + simple ML |
| Turkey |
862 |
42.1 |
42.1 |
Feasible with external data |
Integrate climate/soil databases + simple ML |
|
714 |
42.1 |
42.1 |
Feasible with external data |
Integrate climate/soil databases + simple ML |
| Iran |
392 |
42.1 |
42.1 |
Feasible with external data |
Integrate climate/soil databases + simple ML |
| Morocco |
384 |
42.1 |
42.1 |
Feasible with external data |
Integrate climate/soil databases + simple ML |
| Uzbekistan |
276 |
42.1 |
42.1 |
Feasible with external data |
Integrate climate/soil databases + simple ML |
| Algeria |
232 |
42.1 |
42.1 |
Feasible with external data |
Integrate climate/soil databases + simple ML |
| Russia |
230 |
42.1 |
42.1 |
Feasible with external data |
Integrate climate/soil databases + simple ML |
| Pakistan |
164 |
42.1 |
42.1 |
Feasible with external data |
Integrate climate/soil databases + simple ML |
Feasibility Distribution Analysis
# Analyze distribution of realistic ML feasibility
feasibility_summary <- realistic_ml_assessment %>%
count(realistic_ml_feasibility) %>%
mutate(
percentage = round(n / sum(n) * 100, 1),
realistic_ml_feasibility = factor(realistic_ml_feasibility,
levels = c("Insufficient for any ML", "Descriptive analysis only",
"Geographic/temporal only", "Limited ML possible", "Feasible with external data"))
)
ggplot(feasibility_summary, aes(x = realistic_ml_feasibility, y = n, fill = realistic_ml_feasibility)) +
geom_col(alpha = 0.8) +
geom_text(aes(label = paste0(n, "\n(", percentage, "%)")),
vjust = 0.5, fontface = "bold", size = 3) +
scale_fill_manual(values = c("darkred", "red", "orange", "yellow", "lightgreen")) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "none",
plot.title = element_text(size = 14, face = "bold")
) +
labs(
title = "Realistic ML Feasibility Distribution",
subtitle = "Assessment includes both target variable quality AND predictor availability",
x = "ML Feasibility Category",
y = "Number of Countries"
)

Geographic Visualization
Interactive Leaflet Map
# Create color palette for yield values
pal <- colorNumeric(palette = "YlOrRd", domain = yield$yield_value)
# Create interactive map showing global wheat yield distribution
leaflet(yield) %>%
addTiles() %>% # Add base map tiles
addCircleMarkers(
lng = ~longitude,
lat = ~latitude,
color = ~pal(yield_value), # Color by yield value
popup = ~paste("Country:", Country, "<br>",
"Location:", Location, "<br>",
"Yield:", round(yield_value, 2), "tons/ha"),
radius = 3,
fillOpacity = 0.7
) %>%
addLegend("bottomright",
pal = pal,
values = ~yield_value,
title = "Yield (tons/ha)")